library(tidyverse)
sign <- read_csv2("refactored_table.csv")
sign %>%
filter(non_iconic == "0") %>%
select(4, 6, 1, 2, 8:11) %>%
slice(-c(32, 1164)) %>% # убиваю два неиконичных набоюления, они будут ломать MCA
mutate_all(funs(factor)) ->
sign
sign
summary(sign)
## form-image_assocaition_pattern languages
## object :876 Turkish Sign Language : 129
## tracing :512 Latvian Sign Language : 117
## handling:324 Estonian Sign Language : 112
## contour :168 Italian Sign Language : 112
## ? : 3 Lithuanian Sign Language: 111
## hangling: 1 Polish Sign Language : 111
## (Other) : 3 (Other) :1195
## word semantic_field localization personification
## underground: 39 animals :298 0:1253 0:1503
## knife : 38 clothes :188 1: 634 1: 384
## fork : 36 food :220
## banana : 35 house :216
## bread : 34 instruments:255
## helicopter : 34 nature :403
## (Other) :1671 transport :307
## action parts-wholes
## 0:958 0:1482
## 1:929 1: 405
##
##
##
##
##
Проведем MCA. Какой процент объясненной дисперсии содержат новые компоненты?
sign_mca <- MASS::mca(sign[,-c(1:3)], nf = 8)
data_frame(pca = factor(paste0("PC", seq_along(sign_mca$d)),
levels = paste0("PC", seq_along(sign_mca$d))),
eigenvalue = sign_mca$d^2,
perc_var = round(sign_mca$d^2/sum(sign_mca$d^2), 3),
cum_perc_var = cumsum(perc_var)) %>%
arrange(desc(perc_var)) ->
sign_mca_results
sign_mca_results %>%
gather(method, value, perc_var:cum_perc_var) %>%
mutate(method = factor(method, levels = c("perc_var", "cum_perc_var"))) %>%
ggplot(aes(pca, value, label = round(value, 2)))+
geom_col(fill = "lightblue")+
geom_text(aes(y = value + 0.05))+
facet_grid(~method, scales = "free_x")+
theme_bw()+
labs(x = "", y = "",
title = "Impact of each PC in MCA model")
Первые две переменные описывают 48% дисперсии. Для MCA это высокий процент. Посмотрим, как распеделены исследуемые единицы в новом пространстве. Построим отдельный график для переменных:
# data frame for ggplot
cats = apply(sign[,-c(1:3)], 2, function(x) nlevels(as.factor(x)))
data.frame(sign_mca$cs, Variable = rep(names(cats), cats)) %>%
ggplot(aes(x = X1, y = X2, label = rownames(.))) +
geom_hline(yintercept = 0, colour = "gray70") +
geom_vline(xintercept = 0, colour = "gray70") +
geom_text(aes(colour = Variable)) +
ggtitle("MCA plot of variables using R package MASS")+
theme_bw()+
scale_x_continuous(limits = c(-0.015, 0.015))
Нам потом придется придумать, почему те или иные переменные группируются рядом.
Следующий график с наблюдениями, раскрасим по паттерну tracing-handling:
# data frame for ggplot
data.frame(sign_mca$rs, word = sign$word, language = sign$languages, type = sign$`form-image_assocaition_pattern`) %>%
ggplot(aes(x = X1, y = X2, label = word, color = type)) +
geom_hline(yintercept = 0, colour = "gray70") +
geom_vline(xintercept = 0, colour = "gray70") +
geom_point() +
stat_ellipse()+
ggtitle("MCA plot of observations using R package MASS")+
theme_bw()
Следующий график с наблюдениями, раскрасим по языку:
# data frame for ggplot
data.frame(sign_mca$rs, word = sign$word, language = sign$languages, type = sign$`form-image_assocaition_pattern`) %>%
ggplot(aes(x = X1, y = X2, label = word, color = language)) +
geom_hline(yintercept = 0, colour = "gray70") +
geom_vline(xintercept = 0, colour = "gray70") +
geom_point() +
stat_ellipse()+
ggtitle("MCA plot of observations using R package MASS")+
theme_bw()
Следующий график с наблюдениями, посмотрим на слова:
# data frame for ggplot
data.frame(sign_mca$rs, word = sign$word, language = sign$languages, type = sign$`form-image_assocaition_pattern`) %>%
ggplot(aes(x = X1, y = X2, label = word, color = type)) +
geom_hline(yintercept = 0, colour = "gray70") +
geom_vline(xintercept = 0, colour = "gray70") +
geom_text() +
ggtitle("MCA plot of observations using R package MASS")+
theme_bw() ->
plot
library(plotly)
ggplotly(plot)
Они конечно, налезают друг на друга…
Раскрасим по семантике:
data.frame(sign_mca$rs, word = sign$word, language = sign$languages, type = sign$`form-image_assocaition_pattern`, semantic = sign$semantic_field) %>%
ggplot(aes(x = X1, y = X2, label = word, color = semantic)) +
geom_hline(yintercept = 0, colour = "gray70") +
geom_vline(xintercept = 0, colour = "gray70") +
geom_point() +
stat_ellipse()+
ggtitle("MCA plot of observations using R package MASS")+
theme_bw()
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure